Skip to content

Commit 3e0602b

Browse files
authored
Merge pull request #277 from firecrawl/codex/research-mcp-tools
[codex] Expose research MCP tools
2 parents 126ea92 + c4002df commit 3e0602b

2 files changed

Lines changed: 94 additions & 74 deletions

File tree

src/index.ts

Lines changed: 4 additions & 48 deletions
Original file line number | Diff line number | Diff line change
@@ -23,37 +23,9 @@ interface SessionData {
2323
* instead of the shared server IP.
2424
*/
2525
keylessClientIp?: string;
26-
/**
27-
* Whether the (experimental) research tools are exposed for this session.
28-
* Enabled locally via `FIRECRAWL_RESEARCH=true`, or per-request via the
29-
* `?research=true` query param on the MCP endpoint.
30-
*/
31-
research?: boolean;
3226
[key: string]: unknown;
3327
}
3428

35-
/**
36-
* Decide whether the research tools should be visible for a session.
37-
* Local/stdio/self-hosted: gated by `FIRECRAWL_RESEARCH=true`.
38-
* Remote (HTTP): additionally enabled by a `?research=true` query param on the
39-
* incoming MCP request URL.
40-
*/
41-
function isResearchEnabled(request?: { url?: string }): boolean {
42-
if (process.env.FIRECRAWL_RESEARCH === 'true') return true;
43-
const url = request?.url;
44-
if (url) {
45-
try {
46-
const research = new URL(url, 'http://localhost').searchParams.get(
47-
'research'
48-
);
49-
if (research === 'true') return true;
50-
} catch {
51-
// malformed URL — fall through to disabled
52-
}
53-
}
54-
return false;
55-
}
56-
5729
function normalizeHeader(
5830
value: string | string[] | undefined
5931
): string | undefined {
@@ -290,7 +262,6 @@ const server = new FastMCP<SessionData>({
290262
headers: IncomingHttpHeaders;
291263
url?: string;
292264
}): Promise<SessionData> => {
293-
const research = isResearchEnabled(request);
294265
// FastMCP invokes `authenticate(undefined)` for the stdio transport
295266
// because there is no HTTP request context. Without this null guard,
296267
// accessing `request.headers` throws a TypeError, FastMCP silently
@@ -317,13 +288,13 @@ const server = new FastMCP<SessionData>({
317288
clientIp &&
318289
(await keylessEligible(clientIp))
319290
) {
320-
return { firecrawlApiKey: undefined, research, keylessClientIp: clientIp };
291+
return { firecrawlApiKey: undefined, keylessClientIp: clientIp };
321292
}
322293
throw new Error(
323294
'Firecrawl credentials required: OAuth access token (Authorization: Bearer fco_...) or API key (x-firecrawl-api-key)'
324295
);
325296
}
326-
return { firecrawlApiKey: headerCred, research };
297+
return { firecrawlApiKey: headerCred };
327298
}
328299
329300
const credential = headerCred ?? envCred;
@@ -352,7 +323,7 @@ const server = new FastMCP<SessionData>({
352323
process.exit(1);
353324
}
354325
355-
return { firecrawlApiKey: credential, research };
326+
return { firecrawlApiKey: credential };
356327
},
357328
// Lightweight health endpoint for LB checks
358329
health: {
@@ -2136,21 +2107,6 @@ if (
21362107
}
21372108

21382109
registerMonitorTools(server);
2139-
2140-
// Research tools gating. FastMCP's `canAccess` is only honored on the HTTP
2141-
// transport (the stdio path exposes every registered tool regardless), so we
2142-
// split the two cases:
2143-
// - HTTP (cloud / SSE_LOCAL / HTTP_STREAMABLE_SERVER): always register; each
2144-
// tool's `canAccess` hides it unless the session has research enabled
2145-
// (`FIRECRAWL_RESEARCH=true` env or `?research=true` on the request).
2146-
// - stdio (local): register only when `FIRECRAWL_RESEARCH=true`, since
2147-
// `canAccess` cannot hide them there.
2148-
const isHttpTransport =
2149-
process.env.CLOUD_SERVICE === 'true' ||
2150-
process.env.SSE_LOCAL === 'true' ||
2151-
process.env.HTTP_STREAMABLE_SERVER === 'true';
2152-
if (isHttpTransport || process.env.FIRECRAWL_RESEARCH === 'true') {
2153-
registerResearchTools(server, getClient);
2154-
}
2110+
registerResearchTools(server, getClient);
21552111

21562112
await server.start(args);

src/research.ts

Lines changed: 90 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,8 @@
11
/**
22
* Firecrawl Research tools (experimental).
33
*
4-
* Thin MCP wrappers over the `/v2/research/*` endpoints (arXiv papers + GitHub
5-
* history/readmes). These tools are hidden unless research is enabled for the
6-
* session — locally via `FIRECRAWL_RESEARCH=true`, or remotely via the
7-
* `?research=true` query param on the MCP endpoint (see `isResearchEnabled` in
8-
* index.ts, which sets `session.research`).
4+
* Thin MCP wrappers over the `/v2/search/research/*` endpoints (arXiv papers + GitHub
5+
* history/readmes).
96
*
107
* The installed `@mendable/firecrawl-js` predates the SDK's `research` client,
118
* so we call the endpoints directly through the SDK's HTTP layer (auth +
@@ -18,7 +15,6 @@ import { z } from 'zod';
1815

1916
interface SessionData {
2017
firecrawlApiKey?: string;
21-
research?: boolean;
2218
[key: string]: unknown;
2319
}
2420

@@ -36,7 +32,7 @@ type ClientLike = {
3632
// the callback loosely and narrow to `ClientLike` at each call site.
3733
type GetClient = (session?: SessionData) => unknown;
3834

39-
const BASE = '/v2/research';
35+
const BASE = '/v2/search/research';
4036

4137
/** Append a value (or repeated array values) to a URLSearchParams instance. */
4238
function appendParam(
@@ -73,18 +69,22 @@ const MAX_AFFIL_CHARS = 60;
7369
const MAX_AUTHORS_LINE_CHARS = 400;
7470

7571
interface PaperHit {
76-
paper_id?: string;
72+
paperId?: string;
73+
primaryId?: string;
7774
ids?: Record<string, string[]>;
7875
title?: string;
7976
abstract?: string;
8077
// Search/metadata responses give a comma-joined string; some shapes give the
8178
// structured form — handle both.
8279
authors?: string | { name: string; affiliation?: string }[];
80+
categories?: string[];
81+
createdDate?: string;
82+
updateDate?: string;
8383
}
8484

85-
/** Best display id for a paper: its arXiv id, falling back to the canonical id. */
85+
/** Display id supplied by the API, already ordered for citation/fetch use. */
8686
function displayId(p: PaperHit): string {
87-
return p.ids?.arxiv?.[0] ?? p.paper_id ?? '?';
87+
return p.primaryId ?? 'missing-primary-id';
8888
}
8989

9090
/** Format the authors line, accepting either the string or structured form. */
@@ -122,7 +122,7 @@ function fmtHits(results?: PaperHit[]): string {
122122
if (!results || results.length === 0) return '(no results)';
123123
return results
124124
.map((r) => {
125-
const lines = [`[${displayId(r)}] ${r.title ?? '(untitled)'}`];
125+
const lines = [`## [${displayId(r)}] ${r.title ?? '(untitled)'}`];
126126
const authors = fmtAuthors(r.authors);
127127
if (authors) lines.push(authors);
128128
lines.push(
@@ -135,6 +135,40 @@ function fmtHits(results?: PaperHit[]): string {
135135
.join('\n\n');
136136
}
137137

138+
function fmtPaperMetadata(paper?: PaperHit): string {
139+
if (!paper) return '(paper not found)';
140+
const lines = [`# ${paper.title ?? '(untitled)'}`];
141+
lines.push('');
142+
lines.push(`Paper ID: ${paper.paperId ?? '?'}`);
143+
144+
const ids = Object.entries(paper.ids ?? {})
145+
.flatMap(([namespace, values]) =>
146+
values.map((value) => `${namespace}:${value}`)
147+
)
148+
.join(', ');
149+
if (ids) lines.push(`IDs: ${ids}`);
150+
151+
const authors = fmtAuthors(paper.authors);
152+
if (authors) lines.push(authors);
153+
154+
if (paper.categories?.length) {
155+
lines.push(`Categories: ${paper.categories.join(', ')}`);
156+
}
157+
158+
const dates = [
159+
paper.createdDate ? `created ${paper.createdDate}` : '',
160+
paper.updateDate ? `updated ${paper.updateDate}` : '',
161+
]
162+
.filter(Boolean)
163+
.join('; ');
164+
if (dates) lines.push(`Dates: ${dates}`);
165+
166+
lines.push('');
167+
lines.push('## Abstract');
168+
lines.push((paper.abstract || '(no abstract)').replace(/\s+/g, ' '));
169+
return lines.join('\n');
170+
}
171+
138172
// Cap GitHub matched content so a page of results stays within the MCP
139173
// output-token limit. Higher than abstracts since issue/PR threads carry the
140174
// signal (repro steps, stack traces) the agent actually needs to verify.
@@ -193,18 +227,13 @@ function fmtGithub(results?: GitHubItem[]): string {
193227
.join('\n\n');
194228
}
195229

196-
/** Only present these tools when the session has research enabled. */
197-
const canAccess = (session?: SessionData): boolean =>
198-
session?.research === true;
199-
200230
export function registerResearchTools(
201231
server: FastMCP<SessionData>,
202232
getClient: GetClient
203233
): void {
204234
// --- search_papers ---
205235
server.addTool({
206236
name: 'firecrawl_research_search_papers',
207-
canAccess,
208237
annotations: {
209238
title: 'Search arXiv papers',
210239
readOnlyHint: true,
@@ -270,10 +299,42 @@ export function registerResearchTools(
270299
},
271300
});
272301

302+
// --- inspect_paper ---
303+
server.addTool({
304+
name: 'firecrawl_research_inspect_paper',
305+
annotations: {
306+
title: 'Inspect a paper',
307+
readOnlyHint: true,
308+
openWorldHint: true,
309+
},
310+
description:
311+
'Fetch canonical metadata for one paper by primaryId or canonical paperId. ' +
312+
'Use this after search/related results when you need the full title, abstract, authors, ' +
313+
'categories, source ids, and dates rendered as markdown.',
314+
parameters: z.object({
315+
paperId: z
316+
.string()
317+
.min(1)
318+
.describe(
319+
'Canonical paperId or primaryId such as `arxiv:1706.03762`, `pmcid:PMC12530322`, `pmid:40953549`, or `doi:10.1016/j.neunet.2025.108095`.'
320+
),
321+
}),
322+
execute: async (
323+
args: unknown,
324+
{ session }: { session?: SessionData; log: Logger }
325+
): Promise<string> => {
326+
const { paperId } = args as { paperId: string };
327+
const client = getClient(session) as ClientLike;
328+
const res = await client.http.get<{ paper?: PaperHit }>(
329+
`${BASE}/papers/${encodeURIComponent(paperId)}`
330+
);
331+
return fmtPaperMetadata(res.data?.paper);
332+
},
333+
});
334+
273335
// --- related_papers ---
274336
server.addTool({
275337
name: 'firecrawl_research_related_papers',
276-
canAccess,
277338
annotations: {
278339
title: 'Find related arXiv papers',
279340
readOnlyHint: true,
@@ -322,7 +383,7 @@ export function registerResearchTools(
322383
const client = getClient(session) as ClientLike;
323384
const res = await client.http.get<{
324385
results?: PaperHit[];
325-
pool_size?: number;
386+
poolSize?: number;
326387
note?: string | null;
327388
}>(
328389
withQuery(
@@ -331,16 +392,15 @@ export function registerResearchTools(
331392
)
332393
);
333394
const note = res.data?.note ? `\nnote: ${res.data.note}` : '';
334-
return `${fmtHits(res.data?.results)}\n(pool_size=${res.data?.pool_size ?? 0})${note}`;
395+
return `${fmtHits(res.data?.results)}\n(poolSize=${res.data?.poolSize ?? 0})${note}`;
335396
},
336397
});
337398

338399
// --- read_paper ---
339400
server.addTool({
340401
name: 'firecrawl_research_read_paper',
341-
canAccess,
342402
annotations: {
343-
title: 'Read an arXiv paper',
403+
title: 'Read a paper',
344404
readOnlyHint: true,
345405
openWorldHint: true,
346406
destructiveHint: false,
@@ -351,7 +411,12 @@ export function registerResearchTools(
351411
"reject it (e.g. 'does this paper actually use technique X / report a score on benchmark Y'). " +
352412
"Returns the best-matching passages, or a notice if the paper's full text is unavailable.",
353413
parameters: z.object({
354-
arxiv_id: z.string().min(1),
414+
paperId: z
415+
.string()
416+
.min(1)
417+
.describe(
418+
'Canonical paperId or primaryId such as `arxiv:1706.03762`, `pmcid:PMC12530322`, `pmid:40953549`, or `doi:10.1016/j.neunet.2025.108095`.'
419+
),
355420
question: z.string().min(1),
356421
k: z
357422
.number()
@@ -365,8 +430,8 @@ export function registerResearchTools(
365430
args: unknown,
366431
{ session }: { session?: SessionData; log: Logger }
367432
): Promise<string> => {
368-
const { arxiv_id, question, k } = args as {
369-
arxiv_id: string;
433+
const { paperId, question, k } = args as {
434+
paperId: string;
370435
question: string;
371436
k?: number;
372437
};
@@ -375,7 +440,7 @@ export function registerResearchTools(
375440
appendParam(params, 'k', k);
376441
const client = getClient(session) as ClientLike;
377442
const res = await client.http.get<{ passages?: { text: string }[] }>(
378-
withQuery(`${BASE}/papers/${encodeURIComponent(arxiv_id)}`, params)
443+
withQuery(`${BASE}/papers/${encodeURIComponent(paperId)}`, params)
379444
);
380445
const passages = res.data?.passages ?? [];
381446
return passages.length
@@ -387,7 +452,6 @@ export function registerResearchTools(
387452
// --- search_github ---
388453
server.addTool({
389454
name: 'firecrawl_research_search_github',
390-
canAccess,
391455
annotations: {
392456
title: 'Search GitHub history',
393457
readOnlyHint: true,

0 commit comments

Comments
 (0)